The structure of this file is as follows:
This section was made to keep the file a little more organized.
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
import itertools
import seaborn as sns
from sklearn import decomposition
from mpl_toolkits.mplot3d import Axes3D
from sklearn.metrics import silhouette_score
from sklearn import mixture
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from collections import OrderedDict
from operator import itemgetter
import itertools
# Constants
ROOT_PATH = os.path.abspath(os.getcwd())  # project root: all data files are expected here
SEED = 170  # fixed random seed so every model run is reproducible
VOTES_THRESHOLD = 135720  # minimum national vote total a party must reach to be kept (see remove_small_parties)
# bycode 2018 excel
BYCODE = os.path.join(ROOT_PATH,"bycode2018.xlsx")
# index 2018 excel
INDEX = os.path.join(ROOT_PATH,"index 2018.xlsx")
# sheets in index 2018
NATURAL = "Natural Area"
DISTRICT = "District"
RELIGION = "Religion"
SETTLEMENT_TYPE = "Settlement type"
# elections data (21st, 22nd and 23rd Knesset elections, per settlement / per ballot box)
SETTELMENT21 = os.path.join(ROOT_PATH,"21settelments.xlsx")
SETTELMENT22 = os.path.join(ROOT_PATH,"22settelments.xlsx")
SETTELMENT23 = os.path.join(ROOT_PATH,"23settelments.xlsx")
CALPI21 = os.path.join(ROOT_PATH,"21calpi.xlsx")
def inner_join(df_1,df_2,column_name):
    """Inner-join two dataframes on ``column_name`` and drop duplicate rows."""
    joined = df_1.merge(df_2, on=column_name)
    return joined.drop_duplicates()
def remove_small_parties(df, threshold, first_party_col=7):
    """Drop party columns whose national vote total is below ``threshold``.

    df              : election results, one row per settlement
    threshold       : minimum total votes a party needs to be kept
    first_party_col : index of the first party column; columns before it are
                      metadata (code, registered voters, ...) and never dropped.
                      Defaults to 7, matching the original hard-coded value.

    Returns a new dataframe without the weak-party columns.
    """
    # collect the weak columns first, then drop them in one call instead of
    # rebuilding the dataframe once per dropped column
    weak = [c for c in df.columns[first_party_col:] if df[c].sum() < threshold]
    return df.drop(columns=weak)
def unite_parties(df):
    """Merge small parties into four ideological factions.

    Every party column listed in the mapping below is summed into its faction
    column ('Haredi', 'Right', 'Arab' or 'Left') and then removed; the four
    faction columns are appended at the end of the dataframe. We think those
    factions represent the israeli society.
    """
    party_to_faction = {
        'United Torah Judaism': 'Haredi', 'Shas': 'Haredi',
        'Avoda': 'Left', 'Meretz': 'Left',
        'Consolidation of right-wing parties': 'Right', 'Kolano': 'Right',
        'Israel is our Home': 'Right', 'New Right': 'Right',
        'UAL-Balad': 'Arab', 'Hadash': 'Arab', 'Gesher Avoda': 'Left',
        'Joint list': 'Arab', 'right': 'Right', 'Avoda-Meretz-Gesher': 'Left',
    }
    # start every faction column at zero
    for faction_name in ['Haredi', 'Right', 'Arab', 'Left']:
        df.insert(len(df.columns), faction_name, 0)
    # accumulate each mapped party into its faction (first 7 columns are metadata)
    for party in df.columns[7:]:
        if party in party_to_faction:
            faction_name = party_to_faction[party]
            df[faction_name] = df[faction_name] + df[party]
    # drop the individual party columns that were merged
    merged = [p for p in party_to_faction if p in df.columns]
    return df.drop(merged, axis=1)
def normalize_to_voting_ratios(df_original):
    """Convert absolute vote counts into per-settlement voting proportions.

    Each faction/party column is divided by the settlement's 'valid votes'
    total, so settlements of very different sizes become comparable vectors.
    Returns a new dataframe with only the faction/party columns.
    """
    columns = ['Haredi','Right','Arab','Left','Likud','Blue and white']
    ratios = df_original[columns].astype(float)
    # vectorized row-wise division replaces the original O(rows*cols)
    # iterrows loop; results are identical for non-zero vote totals
    return ratios.div(df_original['valid votes'].astype(float), axis=0)
def add_most_voted_colm(df):
    """Add a 'chosen' column naming the faction/party with the most votes per row."""
    factions = ['Haredi','Right','Arab','Left','Likud','Blue and white']
    # coerce each column to numeric so idxmax compares numbers, not strings
    votes = df[factions].apply(pd.to_numeric)
    df["chosen"] = votes.idxmax(axis=1)
    return df
def barchar_group(df,x_var,groupby_var,size,name):
    """Draw a grouped histogram of ``x_var`` value counts, one color per ``groupby_var`` value.

    df          : source dataframe
    x_var       : categorical column plotted on the x-axis
    groupby_var : column used to split the data into separately-colored series
    size        : upper limit for the y-axis (frequency)
    name        : chart title
    """
    df_agg = df.loc[:, [x_var, groupby_var]].groupby(groupby_var)
    # one list of x-values per group; the comprehension-scoped `df` does not
    # leak in Python 3, so later lines still see the parameter
    vals = [df[x_var].values.tolist() for i, df in df_agg]
    # Draw
    plt.figure(figsize=(16,9), dpi= 80)
    # one Spectral-colormap color per group
    colors = [plt.cm.Spectral(i/float(len(vals)-1)) for i in range(len(vals))]
    # one histogram bin per distinct x value
    n, bins, patches = plt.hist(vals, df[x_var].unique().__len__(), stacked=False, density=False, color=colors[:len(vals)])
    # Decoration: legend maps each group label to its color
    plt.legend({group:col for group, col in zip(np.unique(df[groupby_var]).tolist(), colors[:len(vals)])})
    plt.title(name, fontsize=22)
    plt.xlabel(x_var)
    plt.ylabel("Frequency")
    plt.ylim(0, size)
    plt.xticks(ticks=bins, labels=np.unique(df[x_var]).tolist(), rotation=90, horizontalalignment='left')
    plt.show()
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Oranges):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    Source: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    cm        : confusion matrix, e.g. from sklearn.metrics.confusion_matrix
    classes   : tick labels for both axes, in matrix order
    normalize : if True, each row is scaled to sum to 1 (per-true-class rates)
    title     : figure title
    cmap      : matplotlib colormap for the heatmap
    """
    if normalize:
        # row-normalize so every true-class row sums to 1
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    plt.figure(figsize = (10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size = 24)
    plt.colorbar(aspect=4)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, size = 14)
    plt.yticks(tick_marks, classes, size = 14)
    # floats when normalized, integer counts otherwise
    fmt = '.2f' if normalize else 'd'
    # cells darker than half the max get white text for contrast
    thresh = cm.max() / 2.
    # Labeling the plot
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), fontsize = 20,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.grid(None)
    plt.tight_layout()
    plt.ylabel('True label', size = 18)
    plt.xlabel('Predicted label', size = 18)
def pca_conversion(vectors):
    """Project the data onto its first three principal components for 3-D plotting."""
    pca = decomposition.PCA(n_components=3)
    pca.fit(vectors)
    projected = pca.transform(vectors)
    return pd.DataFrame(projected, columns=["x", "y", "z"])
def bestparties(df):
    """Plot a horizontal bar chart of the 10 parties with the largest vote share.

    Shares are each party column's total divided by the grand total of all
    party columns; the ten largest are re-sorted ascending so the biggest bar
    is drawn at the top of the chart.
    """
    # NOTE(review): parties start at columns[6:] here while
    # remove_small_parties skips 7 columns — confirm which is correct for
    # the settlements file layout.
    party_cols = df.columns[6:]
    total = sum(df[c].sum() for c in party_cols)
    shares = {c: df[c].sum() / total for c in party_cols}
    # top 10 by share, then ascending for barh (OrderedDict is unnecessary)
    top10 = sorted(sorted(shares.items(), key=itemgetter(1), reverse=True)[:10],
                   key=lambda tup: tup[1])
    my_colors = [(x/10.0, x/20.0, 0.75) for x in range(10)]
    plt.figure(figsize=(20,10))
    plt.barh(*zip(*top10), align='center', color=my_colors)
    plt.xlabel("% of Total Votes")
    plt.ylabel("Top 10 parties")
    plt.show()
def biggestCities(df):
    """Plot the 10 settlements with the most valid votes as a horizontal bar chart."""
    colors=['#deebf7','#deebf7','#c6dbef','#9ecae1' ,'#6baed6','#4292c6','#2171b5','#08519c','#08519c','#08306b']
    # ten largest settlements, in ascending order of valid votes
    top = df.sort_values(by=['valid votes']).tail(n=10)
    plt.figure(figsize=(20,10))
    plt.barh(top["name"], top['valid votes'], align='center', color=colors)
    plt.xlabel("Num of votes")
    plt.ylabel("Top 10 Cities")
    plt.show()
I will be removing columns and rows that might make my data too noisy or that have insufficient information.
# Load the settlement "bycode" table and drop columns that are too sparse or
# irrelevant for the analysis, plus rows whose name contains "NA".
bycode = pd.read_excel(BYCODE,dtype=object)
bycode.drop("year",axis = 1,inplace = True)
bycode.drop("Metropolitan association",axis = 1,inplace = True)
# keep only rows whose name does not contain the literal substring "NA"
bycode =bycode[bycode["name"].str.contains("NA") == False]
bycode.drop("name",axis=1, inplace = True)
bycode.drop("Status Montzifali",axis=1,inplace=True)
# correlation heatmap of the remaining (numeric-castable) features
temp=bycode.copy()
temp=temp.astype(float)
corr=temp.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr, vmax=.8, linewidths=0.01,
            square=True,annot=True,cmap='YlGnBu',linecolor="white")
plt.title('Correlation between features');
We fill the dataframe with values that we believe are logical and important for a correct analysis.
Total population in 2018 - column represents the number of people living in the settlement; we fill the gaps with the median of the values in this column.
Arabs/Thereof: Jews/Jews and others - columns represent the number of people living in the settlement by nationality; the gaps are filled according to settlement type: in an Arab settlement everyone is counted as Arab, and in a Jewish settlement everyone is counted as Jewish.
Height - column represents the height of the settlement; we fill it with the most frequent value in the column.
Organizational affiliation code - column represents the organization the settlement belongs to; we put a dummy value "19" to fill gaps.
Natural area code - column represents the natural area of the settlement; gaps are filled according to the district it belongs to.
Local authorities code - column represents the local authority the settlement is associated with; gaps are filled with the dummy value 0.
Religion code - column represents the religion of the settlement; gaps are filled according to the nationality of the settlement.
# fill established with most frequent year
bycode["Established"] = pd.to_numeric(bycode["Established"],errors='coerce')
bycode["Established"] = bycode["Established"].fillna(bycode["Established"].value_counts().idxmax())
bycode["Established"] = bycode["Established"].astype(int)
# fill total population with median population
bycode["Total population in 2018"] = pd.to_numeric(bycode["Total population in 2018"],errors='coerce')
bycode["Total population in 2018"] = bycode["Total population in 2018"].fillna(bycode["Total population in 2018"].median())
bycode["Total population in 2018"] = bycode["Total population in 2018"].astype(int)
# fill arabs, jews, and other
# settlement-type codes treated as Arab settlements: such rows get their whole
# population counted as Arab, every other row as Jewish
arabs_code = [250,260,270,280,290,440,450,460]
for index,row in bycode.iterrows():
    if row['Settlement type code'] in arabs_code and pd.isnull(row['Jews and others']):
        bycode.at[index,"Jews and others"] = "0"
        bycode.at[index,"Thereof: Jews"] = "0"
        bycode.at[index,"Arabs"] = row["Total population in 2018"]
    elif row['Settlement type code'] not in arabs_code and pd.isnull(row['Jews and others']):
        bycode.at[index,"Jews and others"] = row["Total population in 2018"]
        bycode.at[index,"Thereof: Jews"] = row["Total population in 2018"]
        bycode.at[index,"Arabs"] = "0"
# any remaining gaps in the nationality columns become zero
bycode["Arabs"].fillna("0",inplace = True)
bycode["Thereof: Jews"].fillna("0",inplace = True)
bycode["Jews and others"].fillna("0",inplace = True)
# fill height with the most frequent value
bycode["height"].fillna(bycode["height"].value_counts().idxmax(),inplace = True)
# fill Organizational affiliation with the dummy value "19"
bycode["Organizational affiliation code"].fillna("19",inplace = True)
#fill Natural area code
# Missing natural-area codes are imputed from the settlement's district code:
# every district group maps to a representative natural-area code, with "0"
# as the fallback for unknown districts.
for index,row in bycode.iterrows():
    if pd.isnull(row['Natural area code']):
        if row["District code"] == 11:
            bycode.at[index,"Natural area code"] = "111"
        elif row["District code"] in [21,22,23,24,25,29]:
            bycode.at[index,"Natural area code"] = "211"
        # BUG FIX: the original compared a scalar to a list with `==`
        # (always False), so the four branches below never fired and every
        # such row fell through to "0"; `in` is the intended test.
        elif row["District code"] in [31,32]:
            bycode.at[index,"Natural area code"] = "311"
        elif row["District code"] in [41,42,43,44]:
            bycode.at[index,"Natural area code"] = "411"
        elif row["District code"] in [51,52,53]:
            bycode.at[index,"Natural area code"] = "511"
        elif row["District code"] in [61,62]:
            bycode.at[index,"Natural area code"] = "611"
        else:
            bycode.at[index,"Natural area code"] = "0"
#fill Local authorities with the dummy value 0
bycode["Local authorities code"].fillna(0,inplace = True)
# fill Religion: imputed from nationality — settlements with no Arab
# population get religion code "1", the rest get "2"
for index,row in bycode.iterrows():
    if pd.isnull(row['Religion code']):
        if row["Arabs"] == "0":
            bycode.at[index,"Religion code"] = "1"
        else:
            bycode.at[index,"Religion code"] = "2"
# NOTE(review): astype returns a new object; this call discards its result and
# leaves `bycode` unchanged — probably meant `bycode = bycode.astype(float)`.
# Confirm downstream merge dtypes before fixing.
bycode.astype(float,copy=False)
bycode.fillna(0,inplace = True)
This Excel file translates the codes from bycode2018.xlsx into human-readable labels.
# code-to-label translation table (one sheet per categorical feature)
df = pd.read_excel(INDEX)
df  # notebook display
This Excel file contains observations on settlements.
bycode  # notebook display of the cleaned settlement observations
Example for settlements file
All the other features show the number of voters for each party
# preview a settlements file and show summary charts
df = pd.read_excel(SETTELMENT21,dtype=object)
df.drop("name",axis = 1, inplace = True)
df
# top-10 parties by national vote share
bestparties(df)
df = pd.read_excel(SETTELMENT21,dtype=object)
biggestCities(df)
# settlements whose share of all disqualified ballots exceeds 1%
df = pd.read_excel(SETTELMENT21,dtype=object)
# drop the last row — presumably a totals row; confirm against the source file
df.drop(df.tail(1).index,inplace=True)
df['p']=df['Disqualified']/df['Disqualified'].sum()
df=df.sort_values(by=['p'])
df=df[df['p'] > 0.01]
plt.figure(figsize=(20,10))
plt.barh(df["name"],df['p'],align='center')
plt.xlabel("Disqualifition ratio")
plt.ylabel("Cities")
plt.suptitle("Cities with high Disqualifition")
plt.show()
Example for calpi file
All the other features show the number of voters for each party
# preview the calpi (per-ballot-box) file
df = pd.read_excel(CALPI21,dtype=object)
df.drop("name",axis=1,inplace=True)
df  # notebook display
Assumptions used:
The way we approach this problem is by normalizing the dataframe according to the total number of votes cast in each settlement. By normalizing we get a vector that represents the voting behaviour of the settlement, so a large city like Jerusalem can get the same vector as a small kibbutz.
After the normalization we start the clustering process and analyze the different clusters created.
# Build the clustering dataset from the 21st and 22nd election results.
index = pd.read_excel(INDEX,sheet_name=SETTLEMENT_TYPE)
df_21 = pd.read_excel(SETTELMENT21,dtype=object)
df_22 = pd.read_excel(SETTELMENT22,dtype=object)
# using assumptions: drop parties below the threshold, merge the rest into
# factions, and tag every settlement with its most-voted faction
df_21 = remove_small_parties(df_21,VOTES_THRESHOLD)
df_21 = unite_parties(df_21)
df_21 = add_most_voted_colm(df_21)
df_22 = remove_small_parties(df_22,VOTES_THRESHOLD)
df_22 = unite_parties(df_22)
df_22 = add_most_voted_colm(df_22)
df = pd.concat([df_21, df_22])
# inner joins to add the settlement type to every settlement observation
df = inner_join(df,bycode,"code")
df = inner_join(df,index,"Settlement type code")
df.fillna(0,inplace = True)
# keep only the columns used by the clustering analysis
df_base = df[["Likud","Blue and white",'Haredi','Right','Arab','Left',"chosen","Settlement type","valid votes"]]
df_base
In the correlation matrix there is a small signal showing that the "Blue and White" correlations are very similar to the "Left" correlations.
# preparing dataframe for visualization: label-encode the categorical columns
le = LabelEncoder()
df = df_base.copy()
df['Settlement type'] = le.fit_transform(df['Settlement type'])
df["chosen"] = le.fit_transform(df['chosen'])
df = df.astype(int)
# plot correlation matrix
corr=df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr, vmax=.8, linewidths=0.01,
            square=True,annot=True,cmap='YlGnBu',linecolor="white")
plt.title('Correlation between features');
# normalizing vectors: convert absolute votes to per-settlement proportions
df_normalized = normalize_to_voting_ratios(df_base)
# Using PCA to convert the vectors to 3D representation
vectors_vis = pca_conversion(df_normalized)
df_normalized
It is possible to see that there are some clusters in this 3D plot
# 3-D scatter of the PCA-projected voting vectors
fig = plt.figure(figsize=(20,10))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
ax.scatter(vectors_vis["x"], vectors_vis["y"], vectors_vis["z"], cmap=plt.cm.nipy_spectral,
           edgecolor='k')
Use case: General-purpose, even cluster size, flat geometry, not too many clusters
Geometry: Distances between points
Evaluation metric:
In case the Elbow method is not informative enough, we will base our decision on the Silhouette score.
# Model selection for k-means: try k in [2, 6) and record both the inertia
# (for the elbow method) and the silhouette score for each k.
Sum_of_squared_distances = []
sil = []
K = range(2,6)
# checking best parameters for kmeans
for k in K:
    km = KMeans(n_clusters=k,random_state=SEED)
    km.fit(df_normalized)
    Sum_of_squared_distances.append(km.inertia_)
    labels = km.labels_
    sil.append(silhouette_score(df_normalized, labels, metric = 'euclidean'))
The Elbow method is not informative enough; the slope of the curve is quite consistent.
# elbow plot: inertia as a function of k
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
We choose K that gives us the highest score for silhouette
# silhouette score as a function of k
plt.plot(K, sil, 'bx-')
plt.xlabel('k')
plt.ylabel('score')
plt.title('silhouette Score')
plt.show()
At this point we use k-means with k=5 and add a "Cluster Class" column to our base dataframe and to the visualization dataframe.
# training model
km = KMeans(n_clusters=5,random_state=SEED)
km.fit(df_normalized)
y = km.predict(df_normalized)
# creating metrics to evaluate the clusters
# (silhouette, Calinski-Harabasz, Davies-Bouldin)
kmSill=metrics.silhouette_score(df_normalized , y)
kmCalinski=metrics.calinski_harabasz_score(df_normalized , y)
kmDavies=metrics.davies_bouldin_score(df_normalized , y)
# attach the cluster label to both the base and the visualization dataframes
df_base['Cluster Class'] = pd.Series(y, index=df_base.index)
vectors_vis['Cluster Class'] = pd.Series(y, index=df_normalized.index)
# 3-D scatter of the PCA projection, colored by k-means cluster
df = vectors_vis.groupby('Cluster Class')
fig = plt.figure(figsize=(20,10))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
colors = itertools.cycle(["r", "b", "g","y","m"])
for name,cluster in df:
    ax.scatter(cluster["x"], cluster["y"], cluster["z"], cmap=plt.cm.nipy_spectral,c=next(colors),
               edgecolor='k',label=name)
lgnd = plt.legend(prop={'size': 20})
#change the marker size manually for both lines
for legendHandle in lgnd.legendHandles:
    legendHandle._sizes = [400]
We make two aggregations:
# cluster composition by most-voted faction and by settlement type
barchar_group(df_base,"Cluster Class","chosen",1200,"Representative Party")
barchar_group(df_base,"Cluster Class","Settlement type",550, "Settlement type")
The k-means gave us nice insights, especially on the kibbutzim and moshavim. In the first graph you can see quite clearly that in one of the clusters most of the settlements voted mainly for the "Blue and White" party and the "Left" parties. In the second graph you can see that most of the kibbutzim, as well as a large number of moshavim, belong to that same cluster.
Those two facts suggest that kibbutzim and moshavim vote mainly for "Blue and White" and the "Left" parties.
Use case: Flat geometry, good for density estimation
Geometry: Mahalanobis distances to centers
Evaluation metric:
If BIC is not informative enough, we will use Silhouette score
# Model selection for the Gaussian mixture: for every covariance type and
# component count, record the BIC and the silhouette score.
evluate_bic = {}
evluate_sil = {}
n_components_range = range(2, 6)
cv_types = ['spherical', 'tied', 'diag', 'full']
# checking what are the best parameters for GMM
for cv_type in cv_types:
    bic = []
    sil = []
    for n_components in n_components_range:
        # Fit a Gaussian mixture with EM
        gmm = mixture.GaussianMixture(n_components=n_components,
                                      covariance_type=cv_type,random_state=SEED)
        gmm.fit(df_normalized)
        bic.append(gmm.bic(df_normalized))
        labels = gmm.predict(df_normalized)
        sil.append(silhouette_score(df_normalized, labels, metric = 'euclidean'))
    evluate_bic[cv_type] = bic
    evluate_sil[cv_type] = sil
# plotting metrics: one BIC curve and one silhouette curve per covariance type
for key in evluate_bic:
    plt.plot(n_components_range, evluate_bic[key],label=key) # plotting t, c separately
plt.xlabel("components")
plt.ylabel("Bayesian information criterion")
plt.legend(loc=1)
plt.show()
for key in evluate_sil:
    plt.plot(n_components_range, evluate_sil[key],label=key) # plotting t, c separately
plt.xlabel("components")
plt.ylabel("silhouette_score")
plt.legend(loc=1)
plt.show()
According to BIC, it looks like the best parameters are n_components=5 and covariance_type="full".
# training the models
gmm = mixture.GaussianMixture(n_components=5,covariance_type="full",random_state=SEED)
gmm.fit(df_normalized)
y = gmm.predict(df_normalized)
# creating metrics for evaluation (same three scores as for k-means)
gmSill=metrics.silhouette_score(df_normalized , y)
gmCalinski=metrics.calinski_harabasz_score(df_normalized , y)
gmDavies=metrics.davies_bouldin_score(df_normalized , y)
# overwrite the cluster labels with the GMM assignment
df_base['Cluster Class'] = pd.Series(y, index=df_base.index)
vectors_vis['Cluster Class'] = pd.Series(y, index=df_normalized.index)
# 3-D scatter of the PCA projection, colored by GMM cluster
df = vectors_vis.groupby('Cluster Class')
fig = plt.figure(figsize=(20,10))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)
colors = itertools.cycle(["r", "b", "g","y","m"])
for name,cluster in df:
    ax.scatter(cluster["x"], cluster["y"], cluster["z"], cmap=plt.cm.nipy_spectral,c=next(colors),
               edgecolor='k',label=name)
lgnd = plt.legend(prop={'size': 20})
#change the marker size manually for both lines
for legendHandle in lgnd.legendHandles:
    legendHandle._sizes = [400]
We make two aggregations:
# cluster composition by most-voted faction and by settlement type
barchar_group(df_base,"Cluster Class","chosen",1200,"Representative Party")
barchar_group(df_base,"Cluster Class","Settlement type",500,"Settlement type")
GMM also gives us some insights. In one of the clusters in the first graph there is a high concentration of "Blue and White" and "Left" parties. In the second graph you can see that kibbutzim are mainly concentrated in the same cluster.
So GMM suggests that most of the kibbutzim vote mainly for "Blue and White".
In this case k-means performed better than GMM. K-means managed to identify small clusters in the dataset, for example the "Haredi" faction, and distinguish them from the other clusters, while GMM mixed the "Haredi" faction with other factions.
Why did k-means perform better? One reason could be GMM's flexibility, which makes it more tolerant of points lying far from the cluster; another reason could be that GMM works better when each feature is normally distributed.
In the output below we present 3 scoring approaches:
The Davies-Bouldin score is defined as a ratio between the cluster scatter and the cluster separation; a lower value means the clustering is better.
*In the comparison between k-means and GMM, k-means beats GMM in all 3 scores.
# side-by-side comparison of the three clustering scores for k-means vs. GMM
print("kmean sillhoutte score : "+str(kmSill) )
print("gmm sillhoutte score : "+str(gmSill) )
print("")
print("kmean Calinski-Harabasz score : "+str(kmCalinski))
print("gmm Calinski-Harabasz score : "+str(gmCalinski))
print("")
print("kmean Davies score : "+str(kmDavies))
print("gmm Davies score : "+str(gmDavies))
# Hyperparameter grid for the random-forest valid-votes regressor
param_grid = {
    'n_estimators': [100, 150, 200, 250], # The number of trees in the forest.
    'max_depth': [None, 50, 60, 70], # The maximum depth of the tree.
    'max_features': ['sqrt', None,'log2'], # The number of features to consider when looking for the best split
    'min_samples_split': [2, 5, 10], # The minimum number of samples required to split an internal node
    'bootstrap': [True, False] # Whether bootstrap samples are used when building trees.
}
# Regression task: predict the number of valid votes in the 23rd election
# from the bycode settlement features, training on elections 21 and 22.
df21 = pd.read_excel(SETTELMENT21)
df22 = pd.read_excel(SETTELMENT22)
df23 = pd.read_excel(SETTELMENT23)
# every bycode column except the join key is a candidate feature
features = np.delete(bycode.columns,np.where(bycode.columns== "code"))
target = "valid votes"
df21 = inner_join(bycode,df21,"code")
df22 = inner_join(bycode,df22,"code")
df23 = inner_join(bycode,df23,"code")
df_train = pd.concat([df21, df22])
# hold out five hand-picked settlements from the 23rd election as the test set
TEST=["HAIFA","ELAT","AYYELET HASHAHAR","SAKHNIN","QAZRIN"]
df_test = df23[df23['name'].isin(TEST)]
locations=df_test['name'].values.tolist()
# preparing data for training and testing
x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]
# selecting features according to importance and transforming data accordingly
sfm = SelectFromModel(RandomForestRegressor())
sfm.fit(x_train, y_train)
x_train = sfm.transform(x_train)
x_test = sfm.transform(x_test)
# training model with grid search, 5-fold CV, scored by mean absolute error
rs = GridSearchCV(RandomForestRegressor(random_state=SEED), param_grid, n_jobs=-1,
                  scoring='neg_mean_absolute_error', cv=5,
                  verbose=1)
rs.fit(x_train,y_train)
model = rs.best_estimator_
predict_y = rs.predict(x_test)
print("MAE score: "+str(metrics.mean_absolute_error(predict_y,y_test)))
print("features used:")
print(features[sfm.get_support()].values)
# table of predicted vs. actual vote counts, indexed by settlement name
pred = pd.DataFrame(predict_y,columns=["prediction"],index = locations)
test = pd.DataFrame(y_test.values,columns=["actual"],index = locations)
results = pd.concat([pred,test], axis=1)
results
# grouped bar chart comparing predicted vs. actual vote counts per settlement
fig = go.Figure(data=[
    go.Bar(name='Test', x=results.index, y=results['actual'], textposition='auto',
           marker_color='rgb(255,51,51)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6),
    go.Bar(name='Pred', x=results.index, y=results['prediction'], textposition='auto',
           marker_color='rgb(102,178,255)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
])
# Change the bar mode
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_layout(title_text='Comparison between prediction and actual data', xaxis_title="Settlements",
                  yaxis_title="Number of votes",)
fig.show()
# Hyperparameter grid for the AdaBoost per-party regressors
adb_param_grid = {'n_estimators':[50,100,150,200,250], # Number of weak learners to train iteratively.
                  'learning_rate':[0.001, 0.01, 0.1, 1], # It contributes to the weights of weak learners. It uses 1 as a default value.
                  'random_state': [1]}
# preparing data: faction-level vote counts joined with bycode features
df21 = pd.read_excel(SETTELMENT21)
df21 = remove_small_parties(df21,135720)
df21 = unite_parties(df21)
df22 = pd.read_excel(SETTELMENT22)
df22 = remove_small_parties(df22,135720)
df22 = unite_parties(df22)
df23 = pd.read_excel(SETTELMENT23)
df23 = remove_small_parties(df23,135720)
df23 = unite_parties(df23)
# NOTE(review): the literal 135720 duplicates the VOTES_THRESHOLD constant.
features = np.delete(bycode.columns,np.where(bycode.columns== "code"))
df21 = inner_join(bycode,df21,"code")
df22 = inner_join(bycode,df22,"code")
df23 = inner_join(bycode,df23,"code")
df_train = pd.concat([df21, df22])
# five hand-picked settlements from the 23rd election serve as the test set
cities=["JERUSALEM","BENE BERAQ","SAKHNIN","KARMI'EL","DALIYAT AL-KARMEL"]
df_test = df23[df23.name.isin(cities)]
# model for likud: AdaBoost regressor predicting Likud votes per settlement
target = "Likud"
x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]
# choosing features by importance
sfm = SelectFromModel(AdaBoostRegressor())
sfm.fit(x_train, y_train)
x_train = sfm.transform(x_train)
x_test = sfm.transform(x_test)
# grid search with 5-fold cross-validation, scored by mean absolute error
gsADB = GridSearchCV(AdaBoostRegressor(random_state=SEED),param_grid = adb_param_grid, cv=5, scoring="neg_mean_absolute_error", n_jobs= -1)
gsADB.fit(x_train,y_train)
adb = gsADB.best_estimator_
y_predict = adb.predict(x_test)
print("MAE score: "+str(metrics.mean_absolute_error(y_predict,y_test)))
print("features used:")
print(features[sfm.get_support()].values)
# table of predicted vs. actual votes, indexed by settlement name
pred = pd.DataFrame(y_predict,columns=["prediction"],index = df_test["name"])
test = pd.DataFrame(y_test.values,columns=["actual"],index = df_test["name"])
results = pd.concat([pred,test], axis=1)
results
# grouped bar chart comparing predicted vs. actual votes for this target
fig = go.Figure(data=[
    go.Bar(name='Test', x=results.index, y=results['actual'], textposition='auto',
           marker_color='rgb(255,51,51)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6),
    go.Bar(name='Pred', x=results.index, y=results['prediction'], textposition='auto',
           marker_color='rgb(102,178,255)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
])
# Change the bar mode
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_layout(title_text='Comparison between prediction and actual data',
                  yaxis_title="Number of votes",)
fig.show()
# model for Blue and White: same AdaBoost pipeline with a different target
target = "Blue and white"
x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]
# feature selection
sfm = SelectFromModel(AdaBoostRegressor())
sfm.fit(x_train, y_train)
x_train = sfm.transform(x_train)
x_test = sfm.transform(x_test)
# training model with cross-validation
gsADB = GridSearchCV(AdaBoostRegressor(random_state=SEED),param_grid = adb_param_grid, cv=5, scoring="neg_mean_absolute_error", n_jobs= -1)
gsADB.fit(x_train,y_train)
adb = gsADB.best_estimator_
y_predict = adb.predict(x_test)
print("MAE score: "+str(metrics.mean_absolute_error(y_predict,y_test)))
print("features used:")
print(features[sfm.get_support()].values)
# table of predicted vs. actual votes, indexed by settlement name
pred = pd.DataFrame(y_predict,columns=["prediction"],index = df_test["name"])
test = pd.DataFrame(y_test.values,columns=["actual"],index = df_test["name"])
results = pd.concat([pred,test], axis=1)
results
# grouped bar chart comparing predicted vs. actual votes for this target
fig = go.Figure(data=[
    go.Bar(name='Test', x=results.index, y=results['actual'], textposition='auto',
           marker_color='rgb(255,51,51)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6),
    go.Bar(name='Pred', x=results.index, y=results['prediction'], textposition='auto',
           marker_color='rgb(102,178,255)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
])
# Change the bar mode
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_layout(title_text='Comparison between prediction and actual data',
                  yaxis_title="Number of votes",)
fig.show()
# model for Joint List
# BUG FIX: the target for Joint-List votes is the faction column 'Arab'
# created by unite_parties; 'Arabs' is the bycode *population* column, which
# is itself part of `features`, so using it as the target leaked the answer
# into the model inputs.
target = "Arab"
x_train = df_train[features]
y_train = df_train[target]
x_test = df_test[features]
y_test = df_test[target]
# feature selection
sfm = SelectFromModel(AdaBoostRegressor())
sfm.fit(x_train, y_train)
x_train = sfm.transform(x_train)
x_test = sfm.transform(x_test)
# training model with cross-validation
gsADB = GridSearchCV(AdaBoostRegressor(random_state=SEED),param_grid = adb_param_grid, cv=5, scoring="neg_mean_absolute_error", n_jobs= -1)
gsADB.fit(x_train,y_train)
adb = gsADB.best_estimator_
y_predict = adb.predict(x_test)
print("MAE score: "+str(metrics.mean_absolute_error(y_predict,y_test)))
print("features used:")
print(features[sfm.get_support()].values)
# table of predicted vs. actual votes, indexed by settlement name
pred = pd.DataFrame(y_predict,columns=["prediction"],index = df_test["name"])
test = pd.DataFrame(y_test.values,columns=["actual"],index = df_test["name"])
results = pd.concat([pred,test], axis=1)
results
# grouped bar chart comparing predicted vs. actual votes for this target
fig = go.Figure(data=[
    go.Bar(name='Test', x=results.index, y=results['actual'], textposition='auto',
           marker_color='rgb(255,51,51)', marker_line_color='rgb(8,48,107)',marker_line_width=1, opacity=0.6),
    go.Bar(name='Pred', x=results.index, y=results['prediction'], textposition='auto',
           marker_color='rgb(102,178,255)', marker_line_color='rgb(30,70,70)',marker_line_width=1, opacity=0.6)
])
# Change the bar mode
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.update_layout(title_text='Comparison between prediction and actual data',
                  yaxis_title="Number of votes",)
fig.show()
# Hyperparameter grid
# Classification task: flag settlements with an unusually high share of
# disqualified votes (class assigned by `label` below).
param_grid = {
    'n_estimators': [100, 150, 200, 250],#The number of trees in the forest.
    'max_depth': [None, 50, 60, 70] ,#The maximum depth of the tree.
    'max_features': ['sqrt', None],#The number of features to consider when looking for the best split
    'min_samples_split': [2, 5, 10],#The minimum number of samples required to split an internal node
    'bootstrap': [True, False]#Whether bootstrap samples are used when building trees.
}
# model inputs: every bycode column except the join key, plus the new class
features = np.delete(bycode.columns,np.where(bycode.columns== "code"))
target = "class"
columns = np.append(features,target)
def label(row,total):
    """Return 1 when the row's share of all disqualified votes exceeds 1%, else 0."""
    return 1 if row["Disqualified"] / total > 0.01 else 0
def add_class(df):
    """Attach the disqualification class and settlement metadata to each observation.

    Returns a tuple: the fully-joined dataframe and its projection onto the
    model's feature/target columns (the module-level `columns`).
    """
    settlement_types = pd.read_excel(INDEX,sheet_name=SETTLEMENT_TYPE)
    total_disqualified = df["Disqualified"].sum()
    # 1 = unusually high share of disqualified votes (see `label`)
    df['class'] = df.apply(lambda row: label(row, total_disqualified), axis=1)
    df = inner_join(df, bycode, "code")
    df = inner_join(df, settlement_types, "Settlement type code")
    df['Settlement type'] = le.fit_transform(df['Settlement type'])
    return df, df[columns]
# label the three election files and build train (21+22) / test (23) splits
df21 = pd.read_excel(SETTELMENT21)
df22 = pd.read_excel(SETTELMENT22)
df23 = pd.read_excel(SETTELMENT23)
_,df21 = add_class(df21)
_,df22 = add_class(df22)
# keep the full joined frame for the 23rd election so names can be reported
df, df23 = add_class(df23)
df_train=pd.concat([df21, df22])
y_train = df_train[target]
x_train= df_train[features]
y_test = df23[target].copy()
x_test = df23[features].copy()
# Estimator for use in random search
rs = RandomizedSearchCV(RandomForestClassifier(), param_grid, n_jobs = -1,
                        scoring = 'accuracy', cv = 5,
                        n_iter = 1, verbose = 1, random_state=SEED)
rs.fit(x_train,y_train)
# choosing best features for the model
# NOTE(review): `rs.estimator` is the *unfitted* base estimator, so
# SelectFromModel refits it from scratch here — `rs.best_estimator_` was
# probably intended; confirm before changing.
sfm = SelectFromModel(rs.estimator)
sfm.fit(x_train, y_train)
x_train = sfm.transform(x_train)
x_test = sfm.transform(x_test)
# Create the random search model on the reduced feature set
rs = RandomizedSearchCV(RandomForestClassifier(random_state = SEED), param_grid, n_jobs = -1,
                        scoring = 'accuracy', cv = 5,
                        n_iter = 1, verbose = 1, random_state=SEED)
rs.fit(x_train,y_train)
y_predict = rs.predict(x_test)
print("accuracy: "+str(metrics.accuracy_score(y_predict,y_test)))
print("features used:")
print(features[sfm.get_support()].values)
cm = confusion_matrix(y_test, y_predict)
plot_confusion_matrix(cm, classes = ['0', '1'],
                      title = 'Health Confusion Matrix')
# list the settlements predicted to have a high disqualified-vote share
df["predict"] = y_predict
df=df[df.predict == 1]
print(df.name)